{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  作业一（第3章用户冷启动）：\n",
    "在BX-Books数据集上，根据用户的所在地和年龄特征，进行用户冷启动，实现图书推荐，并在验证集上得到推荐的准确率和召回率\n",
    "注意：\n",
    "1. 使用_pickle保存中间结果\n",
    "2. 用户所在地划分精度，年龄特征分段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from collections import defaultdict\n",
    "import _pickle as cPickle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 读取数据-两张表\n",
    "- 用户表\n",
    "- 打分表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>User-ID</th>\n",
       "      <th>Location</th>\n",
       "      <th>Age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>nyc, new york, usa</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>stockton, california, usa</td>\n",
       "      <td>18.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>moscow, yukon territory, russia</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>porto, v.n.gaia, portugal</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>farnborough, hants, united kingdom</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   User-ID                            Location   Age\n",
       "0        1                  nyc, new york, usa   NaN\n",
       "1        2           stockton, california, usa  18.0\n",
       "2        3     moscow, yukon territory, russia   NaN\n",
       "3        4           porto, v.n.gaia, portugal  17.0\n",
       "4        5  farnborough, hants, united kingdom   NaN"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data=pd.read_csv('data/BX-Users.csv',sep=';')\n",
    "user_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>location</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>nyc, new york, usa</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>stockton, california, usa</td>\n",
       "      <td>18.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>moscow, yukon territory, russia</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>porto, v.n.gaia, portugal</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>farnborough, hants, united kingdom</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id                            location   age\n",
       "0        1                  nyc, new york, usa   NaN\n",
       "1        2           stockton, california, usa  18.0\n",
       "2        3     moscow, yukon territory, russia   NaN\n",
       "3        4           porto, v.n.gaia, portugal  17.0\n",
       "4        5  farnborough, hants, united kingdom   NaN"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data.columns=['user_id','location','age']\n",
    "user_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 278858 entries, 0 to 278857\n",
      "Data columns (total 3 columns):\n",
      " #   Column    Non-Null Count   Dtype  \n",
      "---  ------    --------------   -----  \n",
      " 0   user_id   278858 non-null  int64  \n",
      " 1   location  278858 non-null  object \n",
      " 2   age       168096 non-null  float64\n",
      "dtypes: float64(1), int64(1), object(1)\n",
      "memory usage: 6.4+ MB\n"
     ]
    }
   ],
   "source": [
    "user_data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### location 只保留国家"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_data['location']=user_data['location'].apply(\n",
    "    lambda location:location.split(',')[-1].strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "usa                           139711\n",
       "canada                         21658\n",
       "united kingdom                 18538\n",
       "germany                        17043\n",
       "spain                          13147\n",
       "                               ...  \n",
       "libya\"                             1\n",
       "people`s republic of china         1\n",
       "rsa                                1\n",
       "morocco\"                           1\n",
       "hungary and usa                    1\n",
       "Name: location, Length: 709, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data['location'].value_counts().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### 我觉得这种工程思维真的很好，解决问题，而不是想办法让自己显得多厉害，感觉挺好的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "top10_location=user_data['location'].value_counts().sort_values(ascending=False)[:11].index\n",
    "top10_location=['usa', 'canada', 'united kingdom', 'germany', 'spain', 'australia',\n",
    "       'italy', 'france', 'portugal', 'new zealand']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_data['location']=user_data['location'].apply(\n",
    "    lambda location:location if location in top10_location else 'other')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "usa               139711\n",
       "other              35834\n",
       "canada             21658\n",
       "united kingdom     18538\n",
       "germany            17043\n",
       "spain              13147\n",
       "australia          11784\n",
       "italy              11266\n",
       "france              3455\n",
       "portugal            3325\n",
       "new zealand         3097\n",
       "Name: location, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data['location'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 缩放特征Age\n",
    "- 年龄分布与缩放\n",
    "    - 0  <= age <= 6 & > 100   -->0: other\n",
    "    - 7  <= age <= 14        -->1: youth\n",
    "    - 15 <= age <= 18        -->2: juvenile\n",
    "    - 19 <= age <= 45        -->3: young\n",
    "    - 46 <= age <= 69        -->4: adult\n",
    "    - 70 <= age <= 100       -->5: old"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>location</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>usa</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>usa</td>\n",
       "      <td>18.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>portugal</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>united kingdom</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id        location   age\n",
       "0        1             usa   NaN\n",
       "1        2             usa  18.0\n",
       "2        3           other   NaN\n",
       "3        4        portugal  17.0\n",
       "4        5  united kingdom   NaN"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 把年龄里面的NaN值转换成0\n",
    "- 先找到索引\n",
    "- 然后赋值\n",
    "- 把年龄大于100的也转换成0\n",
    "- 把age用cut做"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data['age'][user_data['age'].isna()]=0.0\n",
    "user_data['age'][user_data['age'].isna()].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "G:\\pythonAtt\\Anaconda\\app\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data['age'][user_data['age']>=100]=0.0\n",
    "user_data['age'][user_data['age']>=100].count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### pd.cut函数\n",
    "- bins\n",
    "- labels\n",
    "- right\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "278858"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data['age'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_data['age']=pd.cut(\n",
    "    user_data['age'],\n",
    "    bins=[0,7,15,19,46,70,100],\n",
    "    labels=[0,1,2,3,4,5],\n",
    "    right=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3    114276\n",
       "0    112061\n",
       "4     35529\n",
       "2     11700\n",
       "1      3387\n",
       "5      1905\n",
       "Name: age, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data['age'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>location</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>usa</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>usa</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>other</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>portugal</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>united kingdom</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id        location age\n",
       "0        1             usa   0\n",
       "1        2             usa   2\n",
       "2        3           other   0\n",
       "3        4        portugal   2\n",
       "4        5  united kingdom   0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 278858 entries, 0 to 278857\n",
      "Data columns (total 3 columns):\n",
      " #   Column    Non-Null Count   Dtype   \n",
      "---  ------    --------------   -----   \n",
      " 0   user_id   278858 non-null  int64   \n",
      " 1   location  278858 non-null  object  \n",
      " 2   age       278858 non-null  category\n",
      "dtypes: category(1), int64(1), object(1)\n",
      "memory usage: 4.5+ MB\n"
     ]
    }
   ],
   "source": [
    "user_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_data['age']=user_data['age'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 278858 entries, 0 to 278857\n",
      "Data columns (total 3 columns):\n",
      " #   Column    Non-Null Count   Dtype \n",
      "---  ------    --------------   ----- \n",
      " 0   user_id   278858 non-null  int64 \n",
      " 1   location  278858 non-null  object\n",
      " 2   age       278858 non-null  int32 \n",
      "dtypes: int32(1), int64(1), object(1)\n",
      "memory usage: 5.3+ MB\n"
     ]
    }
   ],
   "source": [
    "user_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "cPickle.dump(\n",
    "    user_data,\n",
    "    open('./data/jxj_user_data.pkl','wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_data=pd.read_csv('data/BX-Book-Ratings.csv',sep=';')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>276725</td>\n",
       "      <td>034545104X</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>276726</td>\n",
       "      <td>0155061224</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>276727</td>\n",
       "      <td>0446520802</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>276729</td>\n",
       "      <td>052165615X</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>276729</td>\n",
       "      <td>0521795028</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id     item_id  rating\n",
       "0   276725  034545104X     0.0\n",
       "1   276726  0155061224     5.0\n",
       "2   276727  0446520802     0.0\n",
       "3   276729  052165615X     3.0\n",
       "4   276729  0521795028     6.0"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "item_data.columns=['user_id','item_id','rating']\n",
    "item_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1149780 entries, 0 to 1149779\n",
      "Data columns (total 3 columns):\n",
      " #   Column   Non-Null Count    Dtype  \n",
      "---  ------   --------------    -----  \n",
      " 0   user_id  1149780 non-null  int64  \n",
      " 1   item_id  1149780 non-null  object \n",
      " 2   rating   1149772 non-null  float64\n",
      "dtypes: float64(1), int64(1), object(1)\n",
      "memory usage: 26.3+ MB\n"
     ]
    }
   ],
   "source": [
    "item_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>location</th>\n",
       "      <th>age</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>usa</td>\n",
       "      <td>2</td>\n",
       "      <td>0195153448</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7</td>\n",
       "      <td>usa</td>\n",
       "      <td>0</td>\n",
       "      <td>034542252</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>canada</td>\n",
       "      <td>0</td>\n",
       "      <td>0002005018</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8</td>\n",
       "      <td>canada</td>\n",
       "      <td>0</td>\n",
       "      <td>0060973129</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8</td>\n",
       "      <td>canada</td>\n",
       "      <td>0</td>\n",
       "      <td>0374157065</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id location  age     item_id  rating\n",
       "0        2      usa    2  0195153448     0.0\n",
       "1        7      usa    0   034542252     0.0\n",
       "2        8   canada    0  0002005018     5.0\n",
       "3        8   canada    0  0060973129     0.0\n",
       "4        8   canada    0  0374157065     0.0"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data=pd.merge(user_data,item_data,how='inner',on='user_id')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "cPickle.dump(data,\n",
    "            open('./data/jxj_data.pkl','wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
